1 Objective

  • check the new regression data (comp_reg_dt.rmd)
    • report the summary statistics and visualize the data

1.1 Question

  • usage contains inf. usage <= 40 is used to filter out such observations, but is it correct? Why usage <= 40 is selected although there are some observations whose usage is more than 40?



2 Data Exploration

2.1 Loading data

# /*===== load packages =====*/
source(here("GitControlled/Codes/0_1_ls_packages.R"))

# /*===== load data =====*/
# /*---- newly updated data ----*/
reg_data <- readRDS(here("Shared/Data/WaterAnalysis/comp_reg_dt.rds")) %>%
  .[year %in% 2008:2012 & usage <= 40,]

# /*---- newly updated aggregated data ()----*/
# reg_agg_data <- readRDS(here("/Shared/Data/WaterAnalysis/agg_comp_reg_dt.rds"))

# /*---- old data ----*/
# var_ls_old <- c('precip_in', 'tmin_in', 'tmax_in', 'gdd_in', 'sand_pct', 'silt_pct', 'clay_pct', 'slope', 'kv', 'awc','usage','treat2', 'tr', 'year', 'nrdname')

# reg_data_old <- readRDS(here("/Shared/Data/WaterAnalysis/data_w_LR_TB.rds")) %>%
#   .[,..var_ls_old]



2.2 Weather and Soil variables: Site-Level

  • remove tmmn_in and tmmx_in from regression anlaysis.
    • they are included in gdd
# /*===== preparation =====*/
# /*---- reg_data ----*/
gene_vars <- c("nrdname", "year")
weather_var <- c("pr_in", "pet_in", "gdd_in")
soil_var <- c("sandtotal_r", "claytotal_r", "silttotal_r", "ksat_r", "awc_r", "slope_r")

reg_long_dt <- reg_data %>%
    .[,c(gene_vars, weather_var, soil_var), with=FALSE] %>%
    melt(id.vars = gene_vars)

 
# /*=================================================*/
#' # Summary statistics 
# /*=================================================*/
reg_data[,c("nrdname", weather_var, soil_var), with = FALSE] %>%
    datasummary(
        formula = as.formula( paste(paste0(c(weather_var, soil_var), collapse = "+"), "nrdname * (Mean + SD + MinMax)",  sep="~")),
        output = "gt", data = .
        )
Lower Republican Tri-Basin
Mean SD MinMax Mean SD MinMax
pr_in 19.67 5.52 [8.09, 30.96] 19.69 5.43 [8.13, 29.85]
pet_in 40.26 5.66 [33.39, 50.94] 40.06 5.67 [33.06, 50.7]
gdd_in 1773.40 151.52 [1528.15, 2044.3] 1758.25 155.30 [1504, 2007.93]
sandtotal_r 15.11 9.52 [5.01, 74.95] 12.18 4.23 [5.52, 71.88]
claytotal_r 22.20 2.86 [8.71, 35.23] 23.27 2.26 [9.48, 35.38]
silttotal_r 62.73 7.39 [16.35, 67.37] 64.57 3.18 [18.64, 67.42]
ksat_r 8.56 4.76 [4.18, 61.92] 7.56 2.52 [4.35, 56.41]
awc_r 0.21 0.01 [0.12, 0.22] 0.21 0.01 [0.13, 0.22]
slope_r 3.60 2.78 [0, 15.93] 2.96 2.68 [0.06, 18.28]
  • unit:
    • pr_in(inch)
    • tmmn_in and tmmx_in(celsius)
    • pet_in(inch)
    • sandtotal_r, claytotal_r, silttotal_r (%)
    • ksat_r (um/s)
    • awc_r (cm/cm)
    • slope_r (%)


# /*=================================================*/
#' # Quick Visualization
# /*=================================================*/
# /*===== Comparison: Treatment and Control groups =====*/
# /*---- weather  ----*/
ggplot(reg_long_dt[variable %in% weather_var,]) +
    geom_boxplot(aes(x=factor(year), y=value, fill=nrdname)) +
    facet_grid(variable ~ . , scale='free') +
    theme(
        legend.position = "bottom",
        title = element_text(face = "bold"),
        strip.text.y = element_text(face = "bold")
        ) +
    ggtitle("Annual Variations in Climate Variables")

# /*---- soil  ----*/
ggplot(reg_long_dt[variable %in% soil_var,]) +
    geom_histogram(aes(x=value))+
    facet_grid(nrdname ~ variable ,scale='free') +
    ggtitle("Histogram by Soil Variables") +
    theme(
        title = element_text(face = "bold"),
        strip.text.y = element_text(face = "bold"),
        strip.text.x = element_text(face = "bold")
        )

  • Similar variations in the weather and soil variables between treatment and control group.



2.3 Scatterplot matrix

# /*=================================================*/
#' # Correlation Matrix
# /*=================================================*/
# /*---- weather ----*/
GGally::ggpairs(
    data = reg_data[,..weather_var],
    # lower = list(continuous = wrap("points", size=0.5)),
    diag= list(continuous="barDiag"),
    title = "Scatterplot matrix of the Weather Data"
    )



# # /*---- soil ----*/
ggpairs(
    data = reg_data[,..soil_var],
    # lower = list(continuous = wrap("points", color= "blue", size=0.5)),
    diag=list(continuous="barDiag"),
    title = "Scatterplot matrix of the Soil Data"
    )